{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1645fc37-fa48-4bcf-a890-258d6a040c8a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Administrator\\anaconda3\\Lib\\site-packages\\paddle\\utils\\cpp_extension\\extension_utils.py:711: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md\n",
      "  warnings.warn(warning_message)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from collections import Counter\n",
    "import paddlehub as hub\n",
    "import paddle\n",
    "from sklearn.model_selection import train_test_split\n",
    "from paddlehub.datasets.base_nlp_dataset import TextClassificationDataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b1dffb59-bca4-4140-a4db-5629c7b26191",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>num</th>\n",
       "      <th>text</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>瞧着这小样儿，突然间感动了，爸妈怎么把我拉扯大的呀～～～</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>习惯和凑和的力量何其强大，革命总是被逼到无法接受的程度以后才会发生，这时仍要忍受数小时的漫长...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>5.尽量在7点前起床，这样有利于排出宿便；</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>原来不知道在何时开始，我已经不再是儿童了，不再是那个可以撒娇的小孩子了！</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>宝贝，节日快乐。</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>我们还要这样的阳光吗？</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>据说济南最低温度24度，可今天青岛最高才21度……</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>据说济南最低温度24度，可今天青岛最高才21度……</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>这才是你的舞台啊！！！</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>享受每一刻的感觉，欣赏每一处的风景，这就是人生。</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   num                                               text  label\n",
       "0    1                       瞧着这小样儿，突然间感动了，爸妈怎么把我拉扯大的呀～～～    3.0\n",
       "1    2  习惯和凑和的力量何其强大，革命总是被逼到无法接受的程度以后才会发生，这时仍要忍受数小时的漫长...    0.0\n",
       "2    3                              5.尽量在7点前起床，这样有利于排出宿便；    0.0\n",
       "3    4               原来不知道在何时开始，我已经不再是儿童了，不再是那个可以撒娇的小孩子了！    2.0\n",
       "4    5                                           宝贝，节日快乐。    1.0\n",
       "5    6                                        我们还要这样的阳光吗？    5.0\n",
       "6    7                          据说济南最低温度24度，可今天青岛最高才21度……    6.0\n",
       "7    8                          据说济南最低温度24度，可今天青岛最高才21度……    6.0\n",
       "8    9                                        这才是你的舞台啊！！！    5.0\n",
       "9   10                           享受每一刻的感觉，欣赏每一处的风景，这就是人生。    0.0"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df=pd.read_excel('moods_classify8_unprocessed.xlsx')\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "09d5da8e-ff00-430d-b4d6-9e147a9a922c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 26462 entries, 0 to 26461\n",
      "Data columns (total 3 columns):\n",
      " #   Column  Non-Null Count  Dtype  \n",
      "---  ------  --------------  -----  \n",
      " 0   num     26462 non-null  int64  \n",
      " 1   text    26432 non-null  object \n",
      " 2   label   26455 non-null  float64\n",
      "dtypes: float64(1), int64(1), object(1)\n",
      "memory usage: 620.3+ KB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "28e0fe95-ff20-4206-a8ec-6243335cdeff",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "num      False\n",
       "text      True\n",
       "label     True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull().any()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ba851891-482a-4431-a81a-e625cb661d50",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>num</th>\n",
       "      <th>text</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>48</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>93</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>367</th>\n",
       "      <td>368</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1101</th>\n",
       "      <td>1102</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1112</th>\n",
       "      <td>1113</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1161</th>\n",
       "      <td>1162</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1208</th>\n",
       "      <td>1209</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1249</th>\n",
       "      <td>1250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1694</th>\n",
       "      <td>1695</td>\n",
       "      <td>我想说中国皇帝有上千个老婆呢。</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1705</th>\n",
       "      <td>1706</td>\n",
       "      <td>2、总是和别人比较。</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2141</th>\n",
       "      <td>2142</td>\n",
       "      <td>在最艰难的时刻，更要相信自己手中握有最好的猎枪。</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2188</th>\n",
       "      <td>2189</td>\n",
       "      <td>今日话题，走着：全国高考今日拉开帷幕，各地高考作文题陆续公布。</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2278</th>\n",
       "      <td>2279</td>\n",
       "      <td>今天六一节，感觉没有什么不同的，很多朋友互相问候节日快乐！</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2360</th>\n",
       "      <td>2361</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2361</th>\n",
       "      <td>2362</td>\n",
       "      <td>禁买令最大的问题在于地域性歧视方面，一是户籍问题，而是限家奴而不限友邦。</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2362</th>\n",
       "      <td>2363</td>\n",
       "      <td>蔡健雅唱过，得不到的就更加爱，太容易来的就不理睬。</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4699</th>\n",
       "      <td>4700</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4700</th>\n",
       "      <td>4701</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4841</th>\n",
       "      <td>4842</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4855</th>\n",
       "      <td>4856</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4910</th>\n",
       "      <td>4911</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4943</th>\n",
       "      <td>4944</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5005</th>\n",
       "      <td>5006</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5069</th>\n",
       "      <td>5070</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10870</th>\n",
       "      <td>10871</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10923</th>\n",
       "      <td>10924</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11108</th>\n",
       "      <td>11109</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11672</th>\n",
       "      <td>11673</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12324</th>\n",
       "      <td>12325</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13252</th>\n",
       "      <td>13253</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14592</th>\n",
       "      <td>14593</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20281</th>\n",
       "      <td>20282</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20380</th>\n",
       "      <td>20381</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20478</th>\n",
       "      <td>20479</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20569</th>\n",
       "      <td>20570</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23780</th>\n",
       "      <td>23781</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         num                                  text  label\n",
       "17        18                                   NaN    0.0\n",
       "47        48                                   NaN    0.0\n",
       "92        93                                   NaN    0.0\n",
       "367      368                                   NaN    0.0\n",
       "1101    1102                                   NaN    0.0\n",
       "1112    1113                                   NaN    0.0\n",
       "1161    1162                                   NaN    0.0\n",
       "1208    1209                                   NaN    0.0\n",
       "1249    1250                                   NaN    0.0\n",
       "1694    1695                       我想说中国皇帝有上千个老婆呢。    NaN\n",
       "1705    1706                            2、总是和别人比较。    NaN\n",
       "2141    2142              在最艰难的时刻，更要相信自己手中握有最好的猎枪。    NaN\n",
       "2188    2189       今日话题，走着：全国高考今日拉开帷幕，各地高考作文题陆续公布。    NaN\n",
       "2278    2279         今天六一节，感觉没有什么不同的，很多朋友互相问候节日快乐！    NaN\n",
       "2360    2361                                   NaN    0.0\n",
       "2361    2362  禁买令最大的问题在于地域性歧视方面，一是户籍问题，而是限家奴而不限友邦。    NaN\n",
       "2362    2363             蔡健雅唱过，得不到的就更加爱，太容易来的就不理睬。    NaN\n",
       "4699    4700                                   NaN    0.0\n",
       "4700    4701                                   NaN    0.0\n",
       "4841    4842                                   NaN    0.0\n",
       "4855    4856                                   NaN    0.0\n",
       "4910    4911                                   NaN    0.0\n",
       "4943    4944                                   NaN    0.0\n",
       "5005    5006                                   NaN    0.0\n",
       "5069    5070                                   NaN    0.0\n",
       "10870  10871                                   NaN    0.0\n",
       "10923  10924                                   NaN    1.0\n",
       "11108  11109                                   NaN    5.0\n",
       "11672  11673                                   NaN    0.0\n",
       "12324  12325                                   NaN    0.0\n",
       "13252  13253                                   NaN    5.0\n",
       "14592  14593                                   NaN    5.0\n",
       "20281  20282                                   NaN    0.0\n",
       "20380  20381                                   NaN    0.0\n",
       "20478  20479                                   NaN    0.0\n",
       "20569  20570                                   NaN    0.0\n",
       "23780  23781                                   NaN    5.0"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show every row that has a missing value in any column.\n",
    "# any(axis=1) collapses the null mask to one boolean per row; indexing with the\n",
    "# raw 2-D mask instead can repeat a row once per missing cell.\n",
    "df[df.isnull().any(axis=1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "5f90f467-4b0c-4b1b-88f0-91e7373dc5c8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "num      False\n",
       "text     False\n",
       "label    False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only rows where both the text and its label are present,\n",
    "# then confirm that no column still contains nulls.\n",
    "df = df.dropna(axis=0, how='any', subset=['text', 'label'])\n",
    "df.isnull().any()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "1c1ca533-5865-4714-9dfe-7fe83d8a3d52",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>num</th>\n",
       "      <th>text</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>据说济南最低温度24度，可今天青岛最高才21度……</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>110</td>\n",
       "      <td>地动山摇！</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>154</th>\n",
       "      <td>155</td>\n",
       "      <td>每一个人应该经常问自己，我们为国为家做过些值得为之骄傲的贡献吗？</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>200</th>\n",
       "      <td>201</td>\n",
       "      <td>一半在尘土里安详，一半在风里飞扬，一半洒落阴凉，一半沐浴阳光。</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>220</th>\n",
       "      <td>221</td>\n",
       "      <td>……突然觉得……</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26457</th>\n",
       "      <td>26458</td>\n",
       "      <td>目光呆泄、反应迟钝、四肢无力、就差发烧了，自己还美滋滋的说帅的不行！</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26458</th>\n",
       "      <td>26459</td>\n",
       "      <td>每天上班都是相同的路，真的想走不一样的路，有一天下很大的雨，要绕路走，结果那天迷了路......</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26459</th>\n",
       "      <td>26460</td>\n",
       "      <td>转眼间，网络科幻小说《悟空传》已十年了，即使十年后昨晚重读仍让人记忆犹新，最记得玄奘说的一句...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26460</th>\n",
       "      <td>26461</td>\n",
       "      <td>默默等到点开门后，大叔讥笑我，你怎么又睡过头啊。。。</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26461</th>\n",
       "      <td>26462</td>\n",
       "      <td>我国文化传媒的行业细分又进入了一个新的阶段。</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>13537 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         num                                               text  label\n",
       "7          8                          据说济南最低温度24度，可今天青岛最高才21度……    6.0\n",
       "109      110                                              地动山摇！    0.0\n",
       "154      155                   每一个人应该经常问自己，我们为国为家做过些值得为之骄傲的贡献吗？    0.0\n",
       "200      201                    一半在尘土里安详，一半在风里飞扬，一半洒落阴凉，一半沐浴阳光。    0.0\n",
       "220      221                                           ……突然觉得……    0.0\n",
       "...      ...                                                ...    ...\n",
       "26457  26458                 目光呆泄、反应迟钝、四肢无力、就差发烧了，自己还美滋滋的说帅的不行！    2.0\n",
       "26458  26459   每天上班都是相同的路，真的想走不一样的路，有一天下很大的雨，要绕路走，结果那天迷了路......    6.0\n",
       "26459  26460  转眼间，网络科幻小说《悟空传》已十年了，即使十年后昨晚重读仍让人记忆犹新，最记得玄奘说的一句...    0.0\n",
       "26460  26461                         默默等到点开门后，大叔讥笑我，你怎么又睡过头啊。。。    5.0\n",
       "26461  26462                             我国文化传媒的行业细分又进入了一个新的阶段。    0.0\n",
       "\n",
       "[13537 rows x 3 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df.duplicated('text')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8bcea035-3a40-4fac-af6e-54a96d354c14",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each repeated text keep only its first occurrence (and that row's label).\n",
    "df = df.drop_duplicates(keep='first', subset='text')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b398730c-cf77-4c82-a9a3-a7b736597ef6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.duplicated('text').any()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "09a224ae-09cb-4dbe-bdb4-b47f12f42ec1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Administrator\\AppData\\Local\\Temp\\ipykernel_2012\\459930883.py:1: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.\n",
      "  plt.boxplot(x=df.label,\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAhYAAAGKCAYAAABQCwh2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAZQElEQVR4nO3dfZBdZYHn8V+/pG93h+5OyBsk6SQozCTCWMrLziLOrM7M4iqlTO2ua1lgrY5MLTMRUKwaYNWROGKG3ZFld0ZxwS0HzYCpWaXkD8WlrEVlgF0IrlI7hFcxgRBCIOnbeenbb3f/yJCdkyYhnTxJp5PPp+pWbp/nnnOf3O577vecvt3d0mw2mwEAKKB1qicAABw/hAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABTTfrTvcHx8PJs2bUpPT09aWlqO9t0DAIeg2WxmcHAwCxcuTGvr/s9LHPWw2LRpU/r7+4/23QIABWzcuDGLFy/e7/hRD4uenp4keybW29t7tO8eADgE9Xo9/f39e1/H9+eoh8Vr3/7o7e0VFgAwzbzR2xi8eRMAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICAChGWAAAxQgLoIxaLWlp+f+XWm2qZwRMAWEBHL5aLRkeri4bHhYXcAISFsDh2zcq3mg5cNwSFgBAMcICAChGWACHr6NjcsuB45awAA5fozExIjo69iwHTijtUz0B4DghIoA4YwEAFCQsAIBihAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICAChGWAAAxQgLAKAYYQEAFCMsAIBihAUAUIywAACKERYAQDGTCovR0dF89rOfzWmnnZaurq686U1vyhe+8IWMj48fqfkBANNI+2RufOONN+ZrX/tabr/99px55pl55JFH8rGPfSx9fX256qqrjtQcAYBpYlJh8eCDD+biiy/ORRddlCRZtmxZ7rzzzjzyyCNHZHIAwPQyqW+FvPOd78yPfvSjPPnkk0mSn//857n//vvzvve9b7/rNBqN1Ov1ygUAOD5N6ozFNddck4GBgSxfvjxtbW0ZGxvLDTfckA9/+MP7XWf16tVZtWrVYU8UADj2TeqMxdq1a7NmzZrccccdefTRR3P77bfnL/7iL3L77bfvd53rrrsuAwMDey8bN2487EkDAMemlmaz2TzYG/f39+faa6/NypUr9y774he/mDVr1mT9+vUHtY16vZ6+vr4MDAykt7d38jMGAI66g339ntQZi127dqW1tbpKW1ubHzcFAJJM8j0W73//+3PDDTdkyZIlOfPMM/Ozn/0sN910U/7gD/7gSM0PAJhGJvWtkMHBwXzuc5/LXXfdlS1btmThwoX58Ic/nD/90z9NR0fHQW3Dt0IAYPo52NfvSYVFCcICAKafI/IeCwCAAxEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICAChGWAAAxQgLAKAYYQEAFCMsAIBihAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKKZ9qicAHCdaWiYuazaP/jyAKeWMBXD4Xi8qDrQcOG4JCwCgGGEBABQjLACAYoQFAFCMsAAO3/5++sNPhcAJx4+bAmWICCDOWAAABQkLAKAYYQEAFCMsAIBihAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICAChGWAAAxQgLAKAYYQEAFCMsAIBiJh0WL7zwQi699NLMmTMn3d3dedvb3pZ169Yd
ibkBANNM+2RuvG3btlxwwQV597vfnR/84AeZP39+nnnmmcyaNesITQ+YNlpaJi5rNo/+PIApNamwuPHGG9Pf359vfOMbe5ctW7as9JyA6eb1ouK15eICTiiT+lbI3XffnXPPPTcf/OAHM3/+/Lz97W/PbbfddsB1Go1G6vV65QIAHJ8mFRbPPvtsbrnllpxxxhn54Q9/mMsvvzxXXnllvvnNb+53ndWrV6evr2/vpb+//7AnDQAcm1qazYM/T9nR0ZFzzz03DzzwwN5lV155ZR5++OE8+OCDr7tOo9FIo9HY+3G9Xk9/f38GBgbS29t7GFMHjhn7+1ZI4lshcJyo1+vp6+t7w9fvSZ2xOPXUU/OWt7ylsmzFihXZsGHDftep1Wrp7e2tXACA49OkwuKCCy7IE088UVn25JNPZunSpUUnBUwz+zsr4WwFnHAmFRaf+tSn8tBDD+VLX/pSnn766dxxxx259dZbs3LlyiM1P2C6aDYnXoATzqTC4rzzzstdd92VO++8M2eddVb+7M/+LDfffHMuueSSIzU/AGAamdSbN0s42Dd/AADHjiPy5k0AgAMRFgBAMcICAChGWAAAxQgLAKAYYQEAFCMsAIBihAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICAChGWAAAxQgLAKAYYQEAFNM+1ROAw7Fr166sX79+qqfBP9i9e3eee+65LFu2LF1dXVM9HZIsX7483d3dUz0NTiDCgmlt/fr1Oeecc6Z6GnDMWrduXc4+++ypngYnEGHBtLZ8+fKsW7duqqfBP3j88cdz6aWXZs2aNVmxYsVUT4fseY7A0SQsmNa6u7sdjR2DVqxY4fMCJyhv3gQAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICAChGWAAAxQgLAKAYYQEAFCMsAIBihAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUc1hhsXr16rS0tOSTn/xkoekAANPZIYfFww8/nFtvvTVvfetbS84HAJjGDiksduzYkUsuuSS33XZbZs+eXXpOAMA0dUhhsXLlylx00UX5vd/7vTe8baPRSL1er1wAgONT+2RX+Pa3v51HH300Dz/88EHdfvXq1Vm1atWkJwYATD+TOmOxcePGXHXVVVmzZk06OzsPap3rrrsuAwMDey8bN248pIkCAMe+SZ2xWLduXbZs2ZJzzjln77KxsbH85Cc/yV/91V+l0Wikra2tsk6tVkutViszWwDgmDapsPjd3/3dPPbYY5VlH/vYx7J8+fJcc801E6ICADixTCosenp6ctZZZ1WWzZw5M3PmzJmwHAA48fjNmwBAMZP+qZB93XfffQWmAQAcD5yxAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICAChGWAAAxQgLAKAYYQEAFCMsAIBihAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICAChGWAAAxQgLAKAYYQEAFCMsAIBihAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQzKTCYvXq1TnvvPPS09OT+fPn5/d///fzxBNPHKm5AQDTzKTC4sc//nFWrlyZhx56KPfee29GR0dz4YUXZufOnUdq
fgDANNI+mRvfc889lY+/8Y1vZP78+Vm3bl1++7d/u+jEAIDpZ1Jhsa+BgYEkycknn7zf2zQajTQajb0f1+v1w7nLY8aGDRuydevWqZ4GHFMef/zxyr/AHnPnzs2SJUumehpHRUuz2WweyorNZjMXX3xxtm3blp/+9Kf7vd3111+fVatWTVg+MDCQ3t7eQ7nrKbdhw4b8+vIVGdq9a6qnAsA00NnVnSfWPz6t46Jer6evr+8NX78P+YzFJz7xifziF7/I/ffff8DbXXfddbn66qsrE+vv7z/Uuz0mbN26NUO7d+U3/tWnM3Pe9P6/QEljI8MZ2v5SOmctSNuMjqmeDhwTdr68MY9958vZunXrtA6Lg3VIYXHFFVfk7rvvzk9+8pMsXrz4gLet1Wqp1WqHNLlj3cx5/eldePpUTwOOLUvfMtUzAKbQpMKi2WzmiiuuyF133ZX77rsvp5122pGaFwAwDU0qLFauXJk77rgj3/ve99LT05PNmzcnSfr6+tLV1XVEJggATB+T+j0Wt9xySwYGBvKud70rp5566t7L2rVrj9T8AIBpZNLfCgEA2B9/KwQAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICAChGWAAAxQgLAKAYYQEAFCMsAIBihAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABQjLIBiGrWn8tK8L6dRe2qqpwJMEWEBFNFMMwO992Ss6+UM9N6TZppTPSVgCggLoIhG51MZ7XohW+/ZmtGuF9LodNYCTkTCAjhszTRT7/lhdj87lM3f3pzdzw6l3vNDZy3gBCQsgMP22tmKl767OUny0nc3O2sBJyhhARyW185WDG8ZyejgaDqXdmZ0cDTDW0actYATUPtUTwCY7sYy1ro9HfNn5PRVp1dHGtuTjMWuBk4cnu3AYWlJe+ZvvSJjrTsnjLWNn5QWuxk4oXjGA4etbWxW2sZmTfU0gGOA91gAAMUICwCgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICAChGWAAAxQgLAKAYYQEAFCMsAIBihAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoJhDCouvfvWrOe2009LZ2ZlzzjknP/3pT0vPCwCYhiYdFmvXrs0nP/nJfOYzn8nPfvaz/NZv/Vbe+973ZsOGDUdifgDANDLpsLjpppvy8Y9/PJdddllWrFiRm2++Of39/bnllluOxPwAgGmkfTI3Hh4ezrp163LttddWll944YV54IEHXnedRqORRqOx9+N6vX4I0zz2nDIrWTLySDoHNu5dNjjanRcbc9LRMpJl3ZsnrPPkzv4kyZKul9LZOlwZ29w4OfXRmZk1Y0fmd2yrjO0a68zzQ/PSmvGcPvOFCdt9ZtfCjDXbsqhza2a27a6MvTw8K9tGetLTviun1l6pjDXGZ+RXu09JkvzazOeTNCvjv9q9II3xjpxSezW97TsrY9tGevLy8Kx0tw1lcefLlbGxZlue2bUwSfLm7k1paxmrjD8/NC+7xjozr2N7Zs8YrIzVR2dmc+Pk1FqHs7TrpX3+py15cufiJMnSrs2ptY5kfHQkwzte3bPdnX0ZHOnMybWdWdC1o7Lm4Egtz++clfaWsZzRt3XfhzBPbJ+X8bRmycxtmTljn8/Nrp5sG+5O74zdWTSz+vW7e3RGnttxcpJkxax955s8U5+T4fH2LOoeSG/HUGXs5aGZ2Tp0Uma2N7LkpO2VseGxtjwzODdJckbvy2lvHa+MPzc4O7vHOrKgazAn13ZVxrY1urJ5d28620ZyWs+rlbHxZkueGJifJHlTzyuptY1Wxl97DOfUdmb+Po9hfbiWF3bt/zFcv31+mmnJ0pNeTXf7SGXsxV092T7cnVkdu3Jq
d/Vzvmt0Rn614+S0pJnls7ZM2O5TA3Mz2mzLou7t6e1oVMa27D4przRmpmfGUBbPHKiMNcba8+zgnCTJr/dtSWtL9ev7l4MnZ2hsRk7pqmd2rfq8ebXRnZd296SrbTjLeqrPx9Hx1jxVn5ckeXPP1nS0Vb++N+yYlZ2jtczt3JF5ndXnTX24My/s6ktH62je3Ft9PibJ49sXJEmWnfRquvZ5DF/Y2Zv6SFdmd+zKKfs8hjtHOrJh5+y0Zjy/Pqv6fOw46eT8cnjpCbuP+MdebMzJ4Gh3Zs8YzLyO7ZWxnWNdeWFobtpaxvLm7k3Z19M7F2U8rVnc+XK626rP5S3Ds7N95KT0tu/MKbXqc25ovCMvvlzdpxzvJhUWW7duzdjYWBYsWFBZvmDBgmzePPGFNElWr16dVatWHfoMj0Fz587Nyve057O/s6ayfM39yUe+lrx5QfLjmyau13LJnn9vuj45/4zq2KVfTf7H3yV//M+Tr3y0OvbDXyT/4uakpyv5u69P3O68y5Otg8n3rk4+cHZ17Oo1yX/6QfKv/0ly+1XVsUd/mZzz2T3X7/vrpDajOn7mnyR//0Jy22XJZe+ujq2+O/n3a5N/tiK5/bPVsedfTfqv2HP9b/8yWXxydfxdX0weejz50oeS6z5QHfv6/0z+8OvJWxYl9/2H6lhjJOn86J7rX/licvZp1fEP/ufkv//v5FPvTW66qDp297rk4m8mc3uSR7+WCXovSwZ3J//xmuQ9b62Orfzr5Kv3JpdckKz54+rYg08l77h+z/Vf/M3E7Z5+dfLMS8mf/FFy6TurY9d/J1n13eTC30jWVls9T29Ozvj0nus/uiWZ11sdP//zyWNPJx+9JLn6d6pjX7k3+cR3krcvS9beUB2r7076Lttz/c4bkzMXV8c/8OXkgUeTaz+QrN7nMfzb/5X8m28li05OHv3Lif/X2r9NhkeT//KZ5F1vqY5ddlvy3+5LPv6u5Ot/WB277++Td9+QdLQnP7994nYXX5G88GryuSuTD/5mdey6tcmf3528/+xk7aerY//3+eSsa/Zcv//rSW9XdfzszySPPZf8u48mK/d5DG/6fvLp7yT/9PRk7T67r5fryfw/2nP9u19OTj+lOv6eP08eeiz5/L9Mrn9fdWzN/clHvrVnH7H2APuI/3r96+8j/uZA+4gb9+wj/o99RJL97yPufW0fcWl17O51ycVf27OPuP8A+4h73mAfcfvr7SO+k3R2dWfu3LkTN3wcamk2m803vtkemzZtyqJFi/LAAw/k/PPP37v8hhtuyLe+9a2sX79+wjqvd8aiv78/AwMD6e3tnXD76eL5px/O4JYnK8vGWnsz3LEoLeONdA4/O2Gd3Z0rkiS1xnNpbVaPGoZnLMxYW1/aR7dlxmg10sZbZ6bRsSRpjqWrUb3PJNldOz1pmZGO4Y1pG68eZY60z89o+5y0jdXTMVI929Fs6cxQbc8zr2toffY9GhnqOC3N1s50jLyYtrHtlbHRtjkZmTE/rWM7Uxupvr+mmfYMde7ZK3YOPZWWVI+KGzOWZLxtZmaMbEn7WPUIaaxtVoZnnJqW8aF0Dv9yn/9pS3Z3Lt+z3cYv09IcSmOokU0v7jm62Jn5Gc7MdGYgXakeNYykOzuyIC0ZzaxszL62Z2maaU1PNqc91c/NrsxJI73pyI7MTPXIazS1DGbPkdfs7DvfZCCLMp6OzMyWdKR6RLc7szKU2WnPrvSkeuQ1nhkZyJ5X/VnZkJZUj+gGc2pG05muvJLOVM+iNNKTXZmbtjTSm32PvFqzLUuTJL15IW2pHkntyPyMZGY6sz1dqR6pj2RmdmR+WjOavtd5DPdstzU9eTHtqR7R7crcNNKTWgbTnerZjtF0ZjCnJhnP7PxqwnYH0p/xtOekbMmMCY/h7AxlVmZkZ05K9WzHWDpSz6Ik+YftVs/61LMwY6mlO1tTS/WoeCi92Z05ac9QevJiZayZtmzPkiRJX55Pa6pHxYNZ
kNF0pzPb0pXtlbHhzMzOzE9rhtOXiWcft2XP87Enm9Ke6tmZnZmX4ZyUWurpTvV5M5quDOaUtGQ8s/Z5DBeeujDjfWeesPuIf2x4xqKMtfWmffSVzBjd5+ul9aQMd/QnzZF0NZ7OvnbXfi1paUtteENax6tfhyPtp2S0fXbaxgbSMVJ9zo23dKVRW5a5c+dmyZIlE7Y7ndTr9fT19b3h6/ekzljMnTs3bW1tE85ObNmyZcJZjNfUarXUarXJ3M20sPj085LTzzvALc4/wNjZBxh7Iwe6z8PZ7uGse6S2+46D2u7bDuMe4MRyYu4jJu83DzB2pB6H48ek3rzZ0dGRc845J/fee29l+b333pt3vONAn2AA4EQwqTMWSXL11VfnIx/5SM4999ycf/75ufXWW7Nhw4ZcfvnlR2J+AMA0Mumw+NCHPpRXXnklX/jCF/Liiy/mrLPOyve///0sXbr0SMwPAJhGJvXmzRIO9s0fAMCx42Bfv/2tEACgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoJhJ/0rvw/XaL/qs1+tvcEsA4Fjx2uv2G/3C7qMeFoODg0mS/v7+o33XAMBhGhwcTF9f337Hj/rfChkfH8+mTZvS09OTlpaWo3nXwBFWr9fT39+fjRs3+ltAcJxpNpsZHBzMwoUL09q6/3dSHPWwAI5f/sgg4M2bAEAxwgIAKEZYAMXUarV8/vOfT61Wm+qpAFPEeywAgGKcsQAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQzP8DJebDEUrrw+IAAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Box plot of the label codes: points beyond the whiskers are drawn in red,\n",
    "# the mean as a black diamond and the median as a dashed orange line.\n",
    "plt.boxplot(x=df.label,\n",
    "            whis=1.5,\n",
    "            widths=0.8,\n",
    "            patch_artist=True,\n",
    "            showmeans=True,\n",
    "            boxprops={'facecolor': 'steelblue'},\n",
    "            flierprops={'markerfacecolor': 'red', 'markeredgecolor': 'red', 'markersize': 4},\n",
    "            # 'D' is the diamond marker shape, so it is passed as 'marker'; as a second\n",
    "            # 'markerfacecolor' key it would be silently overwritten by 'black'.\n",
    "            meanprops={'marker': 'D', 'markerfacecolor': 'black', 'markersize': 4},\n",
    "            medianprops={'linestyle': '--', 'color': 'orange'})\n",
    "# Blank the single x tick label through xticks(): boxplot's 'labels' keyword is\n",
    "# deprecated since Matplotlib 3.9 (renamed 'tick_labels'), while xticks() works on\n",
    "# both older and newer releases. The lone box sits at x position 1.\n",
    "plt.xticks([1], [''])\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3fb759fa-bbc5-4de6-a296-5212260b346f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
